Tidycensus
The Census Bureau only released APIs for the following time frames:
get_acs()
- American Community Survey 1-Year Data (2011-2016)
- American Community Survey 3 Year Data (2012-2013)
- American Community Survey 5-Year Data (2009-2016)
get_decennial()
- Decennial US Census (1990, 2000, 2010)
Only specific geographies are available for this method:
"us"'
"region"
"state"
"county"
"public use microdata area"
"tract"
library(tidyverse)
library(tidycensus)
# devtools::install_github("austensen/acssf")
library(acssf) # # helper functions: acs_vars(), acs_sum()
out_dir <- " " # insert path to folder in which you want to store final files
census_api_key("YOUR API KEY GOES HERE")
Selecting Variables
- Finding variables
# Pull variables
acs_variables <- load_variables(2016, "acs5")
head(acs_variables)
| B00001_001 |
Estimate!!Total |
UNWEIGHTED SAMPLE COUNT OF THE POPULATION |
| B00002_001 |
Estimate!!Total |
UNWEIGHTED SAMPLE HOUSING UNITS |
| B01001_001 |
Estimate!!Total |
SEX BY AGE |
| B01001_002 |
Estimate!!Total!!Male |
SEX BY AGE |
| B01001_003 |
Estimate!!Total!!Male!!Under 5 years |
SEX BY AGE |
| B01001_004 |
Estimate!!Total!!Male!!5 to 9 years |
SEX BY AGE |
# In R Studio, use view and filter to search for a variable
View(acs_variables)
# Search columns in a known table
acs_variables %>%
filter(str_detect(name,"B19001")) %>%
head()
| B19001_001 |
Estimate!!Total |
HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) |
| B19001_002 |
Estimate!!Total!!Less than $10,000 |
HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) |
| B19001_003 |
Estimate!!Total!!$10,000 to $14,999 |
HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) |
| B19001_004 |
Estimate!!Total!!$15,000 to $19,999 |
HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) |
| B19001_005 |
Estimate!!Total!!$20,000 to $24,999 |
HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) |
| B19001_006 |
Estimate!!Total!!$25,000 to $29,999 |
HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2016 INFLATION-ADJUSTED DOLLARS) |
# Search for a know variable using a keyword
acs_variables %>%
filter(str_detect(label,fixed("race", ignore_case = TRUE))) %>%
head()
| B02001_007 |
Estimate!!Total!!Some other race alone |
RACE |
| B02001_008 |
Estimate!!Total!!Two or more races |
RACE |
| B02001_009 |
Estimate!!Total!!Two or more races!!Two races including Some other race |
RACE |
| B02001_010 |
Estimate!!Total!!Two or more races!!Two races excluding Some other race, and three or more races |
RACE |
| B03002_008 |
Estimate!!Total!!Not Hispanic or Latino!!Some other race alone |
HISPANIC OR LATINO ORIGIN BY RACE |
| B03002_009 |
Estimate!!Total!!Not Hispanic or Latino!!Two or more races |
HISPANIC OR LATINO ORIGIN BY RACE |
- Selecting variables
vars1 <- acs_vars("B17010B_{c(4, 11, 17, 24, 31, 37)*}")
vars2 <- acs_vars("B01001_{c(3:6, 27:30)*}")
Getting Data
- Source
# Decennial
get_decennial(geography = "state",
variables = c(Population = "P001001"),
year = 2010,
output = "wide") %>%
head(5)
## Getting data from the 2010 decennial Census
| 01 |
Alabama |
4779736 |
| 02 |
Alaska |
710231 |
| 04 |
Arizona |
6392017 |
| 05 |
Arkansas |
2915918 |
| 06 |
California |
37253956 |
# American Community Survey
get_acs(geography = "tract",
state = "NY",
variables = c(Female_Newborn = "B01001_027"),
year = 2016,
survey = "acs5", # acs5 = five-year estimates, acs1= one-year estimates
output = "wide") %>%
rename(Female_Newborn_Estimate = Female_NewbornE,
Female_Newborn_MOE = Female_NewbornM) %>%
head(5)
## Getting data from the 2012-2016 5-year ACS
| 36001000100 |
Census Tract 1, Albany County, New York |
87 |
54 |
| 36001000200 |
Census Tract 2, Albany County, New York |
170 |
86 |
| 36001000300 |
Census Tract 3, Albany County, New York |
241 |
104 |
| 36001000401 |
Census Tract 4.01, Albany County, New York |
71 |
44 |
| 36001000403 |
Census Tract 4.03, Albany County, New York |
170 |
94 |
- Whole Table
# Getting the whole table
get_acs(geography = "county",
state = "NY",
table = "B19001",
year = 2016,
survey = "acs5",
output = "wide") %>%
head(3)
## Getting data from the 2012-2016 5-year ACS
## Loading ACS5 variables for 2016 from table B19001. To cache this dataset for faster access to ACS tables in the future, run this function with `cache_table = TRUE`. You only need to do this once per ACS dataset.
| 36001 |
Albany County, New York |
124108 |
805 |
7606 |
600 |
6285 |
524 |
5578 |
517 |
5929 |
639 |
5026 |
419 |
5819 |
566 |
4858 |
460 |
5379 |
444 |
4663 |
527 |
9980 |
629 |
12530 |
723 |
16017 |
756 |
11693 |
641 |
7773 |
467 |
8093 |
576 |
6879 |
490 |
| 36003 |
Allegany County, New York |
18032 |
295 |
1282 |
173 |
1193 |
156 |
1194 |
157 |
1114 |
135 |
988 |
120 |
1122 |
119 |
1113 |
129 |
1209 |
152 |
874 |
112 |
1605 |
155 |
1916 |
164 |
2277 |
196 |
1103 |
114 |
501 |
84 |
320 |
60 |
221 |
48 |
| 36005 |
Bronx County, New York |
490740 |
1515 |
76719 |
1775 |
45398 |
1629 |
36856 |
1357 |
30086 |
1030 |
27848 |
995 |
26839 |
1108 |
24339 |
1193 |
22364 |
1118 |
18635 |
1096 |
34602 |
1184 |
39721 |
1432 |
44111 |
1429 |
26552 |
1007 |
14395 |
880 |
12865 |
638 |
9410 |
591 |
- Iterating
# Over multiple states
get_acs(geography = "tract",
state = c("NY", "CT", "NJ", "PA", "MA", "RI"),
variables = vars2,
year = 2016,
survey = "acs5",
output = "wide") %>%
head(3)
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
## Getting data from the 2012-2016 5-year ACS
| 36001000100 |
Census Tract 1, Albany County, New York |
90 |
58 |
46 |
45 |
52 |
41 |
58 |
38 |
87 |
54 |
76 |
53 |
97 |
63 |
42 |
38 |
| 36001000200 |
Census Tract 2, Albany County, New York |
159 |
113 |
100 |
71 |
78 |
74 |
52 |
48 |
170 |
86 |
127 |
108 |
142 |
106 |
0 |
11 |
| 36001000300 |
Census Tract 3, Albany County, New York |
165 |
101 |
192 |
110 |
177 |
104 |
149 |
132 |
241 |
104 |
105 |
145 |
171 |
143 |
47 |
63 |
# Over multiple years
map_dfr(2012:2016, function(x) {
get_acs(geography = "state",
state = "NY",
variables = "B03002_001",
year = x,
survey = "acs5") %>%
mutate(year = x)})
## Getting data from the 2008-2012 5-year ACS
## Getting data from the 2009-2013 5-year ACS
## Getting data from the 2010-2014 5-year ACS
## Getting data from the 2011-2015 5-year ACS
## Getting data from the 2012-2016 5-year ACS
| 36 |
New York |
B03002_001 |
19398125 |
NA |
2012 |
| 36 |
New York |
B03002_001 |
19487053 |
NA |
2013 |
| 36 |
New York |
B03002_001 |
19594330 |
NA |
2014 |
| 36 |
New York |
B03002_001 |
19673174 |
NA |
2015 |
| 36 |
New York |
B03002_001 |
19697457 |
NA |
2016 |
Wrangling Data
- Aggregating
get_acs(geography = "tract",
state = "NY",
variables = vars2,
year = 2016,
survey = "acs5",
output = "wide") %>%
mutate(pop_u18_total = acs_sum("B01001_{c(3:6, 27:30)*}E"),
pop_u18_male = acs_sum("B01001_{c(3:6)*}E"),
pop_u18_female = acs_sum("B01001_{c(27:30)*}E")) %>%
select(GEOID, pop_u18_total, pop_u18_male, pop_u18_female) %>%
head(3)
## Getting data from the 2012-2016 5-year ACS
| 36001000100 |
548 |
246 |
302 |
| 36001000200 |
828 |
389 |
439 |
| 36001000300 |
1247 |
683 |
564 |
- Normalizing
race_vars <- c(White = "B03002_003", Black = "B03002_004", Native = "B03002_005",
Asian = "B03002_006", HIPI = "B03002_007", Hispanic = "B03002_012")
total_pop <- "B03002_001"
ny_acs_income_data <- get_acs(geography = "county",
state = "NY",
variables = race_vars,
summary_var = total_pop,
year = 2016,
survey = "acs5")
ny_acs_income_data %>%
mutate(pct = 100 * (estimate / summary_est)) %>%
head(3)
| 36001 |
Albany County, New York |
White |
226677 |
292 |
307891 |
NA |
73.622483 |
| 36001 |
Albany County, New York |
Black |
36350 |
726 |
307891 |
NA |
11.806126 |
| 36001 |
Albany County, New York |
Native |
329 |
94 |
307891 |
NA |
0.106856 |
- Largest variable
ny_acs_income_data %>%
group_by(GEOID) %>%
filter(estimate == max(estimate)) %>%
head()
| 36001 |
Albany County, New York |
White |
226677 |
292 |
307891 |
NA |
| 36003 |
Allegany County, New York |
White |
45116 |
40 |
47700 |
NA |
| 36005 |
Bronx County, New York |
Hispanic |
796193 |
NA |
1436785 |
NA |
| 36007 |
Broome County, New York |
White |
166815 |
90 |
197381 |
NA |
| 36009 |
Cattaraugus County, New York |
White |
71489 |
41 |
78506 |
NA |
| 36011 |
Cayuga County, New York |
White |
71408 |
85 |
78783 |
NA |
ny_acs_income_data %>%
group_by(GEOID) %>%
filter(estimate == max(estimate)) %>%
group_by(variable) %>%
tally()
- Case counts
ny_acs_income_data %>%
filter(variable != "B19001_001") %>% #"B19001_001" is the total count
mutate(incgroup = case_when(
variable < "B19001_008" ~ "below35k",
variable < "B19001_013" ~ "35kto75k",
TRUE ~ "above75k")) %>%
group_by(NAME, incgroup) %>%
summarize(group_est = sum(estimate)) %>%
head(3)
| Albany County, New York |
above75k |
280590 |
| Albany County, New York |
below35k |
18567 |
| Allegany County, New York |
above75k |
46659 |
ACS Margins of Error
Whereas the Decennial census is a literal census (or as close to one as possible), the ACS is based on a sampling. As such, the numbers are really estimates which also have margins of error because they are not literal counts.
- Scale of margin of error
ny_elder_poverty <- get_acs(geography = "tract",
state = "NY",
variables = c(male = "B17001_016", female = "B17001_030"),
year = 2016,
survey = "acs5")
moe_check <- ny_elder_poverty %>%
filter(moe > estimate)
nrow(moe_check) / nrow(ny_elder_poverty)
## [1] 0.8294022
- Preserving MOE when using functions
# Adding variables
moe_sum(moe = c(55, 33, 44, 12, 4),
estimate = c(1000, 125, 300, 54, 12))
## [1] 78.80355
# Multiplying variables
moe_product(est1 = 55,
est2 = 33,
moe1 = 12,
moe2 = 9)
## [1] 633.9093
# Finding the ration between variables
moe_ratio(num = 1000,
denom = 950,
moe_num = 200,
moe_denom = 177)
## [1] 0.287724
# Findthing the proportion of one variable by another
moe_prop(num = 374,
denom = 1200,
moe_num = 122,
moe_denom = 333)
## [1] 0.05344178
# Example
ny_elder_poverty <- ny_elder_poverty %>%
group_by(GEOID) %>%
summarize(estmf = sum(estimate),
moemf = moe_sum(moe = moe, estimate = estimate))
moe_check <- ny_elder_poverty %>%
filter(moemf > estmf)
nrow(moe_check) / nrow(ny_elder_poverty)
## [1] 0.6520943
Plotting
ny_county_income <- get_acs(geography = "county",
state = "NY",
variables = c(hhincome = "B19013_001"),
year = 2016,
survey = "acs5") %>%
mutate(NAME = str_replace(NAME, " County, New York", "")) %>%
filter(NAME %in% c("Kings", "Queens", "New York", "Bronx", "Richmond", "Westchester", "Nassau", "Albany"))
ggplot(ny_county_income, aes(x = estimate, y = reorder(NAME, estimate))) +
geom_errorbarh(aes(xmin = estimate - moe, xmax = estimate + moe)) +
labs(title = "Median Household Income by County",
x = "ACS estimate (bars represent margins of error)",
y = "") +
scale_x_continuous(labels = scales::dollar)

Saving Data
write_csv(dataset, str_glue("{out_dir}/dataset.csv"))
Visualization
ggplot(ny_counties, aes(fill = B03002_001E)) +
geom_sf() +
scale_fill_viridis_c(name = "Population")

ggplot(ny_counties, aes(fill = B03002_001E, color = B03002_001E)) +
geom_sf() +
scale_fill_viridis_c(name = "Population")+
scale_color_viridis_c(guide = FALSE) +
theme_minimal() +
coord_sf(datum = NA) +
labs(title = "Population by County",
caption = "Data source: 2012-2016 ACS.\nData acquired with the R tidycensus package.")

race_vars <- c(White = "B03002_003", Black = "B03002_004", Asian = "B03002_006", Hispanic = "B03002_012")
total_pop <- "B03002_001"
nyc_race_data <- get_acs(geography = "tract",
state = "NY",
county = c("Bronx", "New York", "Queens", "Kings", "Richmond"),
variables = race_vars,
summary_var = total_pop,
year = 2016,
survey = "acs5") %>%
mutate(pct = 100 * (estimate / summary_est))
nyc_tracts <- tracts(state = "NY", county = c("Bronx", "New York", "Queens", "Kings", "Richmond"), cb = TRUE)
nyc_tracts_race <- left_join(nyc_tracts, nyc_race_data, by = "GEOID")
ggplot(nyc_tracts_race, aes(fill = pct, color = pct)) +
geom_sf() +
scale_fill_viridis_c(name = "Percent of Population", na.value = "white")+
scale_color_viridis_c(guide = FALSE, na.value = "white") +
theme_minimal() +
coord_sf(datum = NA) +
facet_wrap(~variable)

library(mapview)
data_map <- ny_counties %>%
select(NAME.x, B03002_001E, geometry) %>%
rename(Population = B03002_001E,
County = NAME.x)
m <- mapview(data_map,
zcol = "Population",
legend = TRUE)
m@map
library(sf)
nyc_dots <- map(c("White", "Black", "Hispanic", "Asian"), function(group) {
nyc_tracts_race %>%
filter(variable == group) %>%
st_sample(., size = .$estimate / 100) %>%
st_sf() %>%
mutate(group = group)
}) %>%
reduce(rbind) %>%
group_by(group) %>%
summarize()
nyc_boundary <- counties("NY", cb = TRUE) %>%
filter(NAME %in% c("Bronx", "New York", "Queens", "Kings", "Richmond"))
ggplot() +
geom_sf(data = nyc_boundary, color = "black", fill = "white") +
geom_sf(data = nyc_dots, aes(color = group, fill = group), size = 0.1) +
coord_sf(datum = NA) +
scale_color_brewer(palette = "Set1", guide = FALSE) +
scale_fill_brewer(palette = "Set1") +
labs(title = "The Racial Geography of NYC",
subtitle = "2011-2016 ACS",
fill = "",
caption = "1 dot = approximately 1,000 people.\nData acquired with the R tidycensus and tigris packages.")
